Loading one-to-one orthologs between P. barbatus and C. floridanus

path_to_pogodata <- paste0(path_to_repo,"/data/gordon_data/")
pogo.cflo <-
  read.csv(paste0(path_to_pogodata, "pogo_cflo_orthologs.csv"),
           header = T, stringsAsFactors = F, na.strings = c(""," ",".", "NA")) %>%
  as_tibble() %>% 
  select(cflo_gene,pogo_gene) %>% 
  distinct()

# make a function that takes pogo names and returns cflo names
pogo_to_cflo <- function(geneset) {
  cflo_genes <- 
    pogo.cflo %>% 
    filter(pogo_gene %in% geneset) %>% 
    pull(cflo_gene) %>% 
    unique() %>% 
    as.character()
  
  return(cflo_genes)
}

# make a function that takes pogo names and returns cflo names
cflo_to_pogo <- function(geneset) {
  pogo_genes <- 
    pogo.cflo %>% 
    filter(cflo_gene %in% geneset) %>% 
    pull(pogo_gene) %>% 
    unique() %>% 
    as.character()
  
  return(pogo_genes)
}

Overview/Goals

Using time-course RNASeq data from healthy Cflo forager heads:

  • build a circadian gene co-expression network (GCN),
  • annotate the network using published data,

Step 1: Build circadian GCN

1.1 Load data

Datasets:

  • Cflo forager ant brains - LD (Das and de Bekker 2022)
# Specify the path to TC5 database
path_to_tc5_repo = "~/Documents/GitHub/Das_et_al_2021"
# Load the TC5 database
tc5.db <- dbConnect(RSQLite::SQLite(), 
                    paste0(path_to_tc5_repo,"/data/TC5_data.db"))

# Specify the path to the result-summary from de Bekker and Das 2021
path_to_tc5_results <- paste0(path_to_tc5_repo, "/results/gene_lists/")
## load genes of interest
cflo.for24nur8 <- read.csv(paste0(path_to_tc5_results, "for24nur8_all.csv")) %>% pull(gene_name) %>% unique()
cflo.for24 <- read.csv(paste0(path_to_tc5_results, "for24_all.csv")) %>% pull(gene_name) %>% unique()
cflo.for12 <- read.csv(paste0(path_to_tc5_results, "for12_all.csv")) %>% pull(gene_name) %>% unique()
cflo.for8 <- read.csv(paste0(path_to_tc5_results, "for8_all.csv")) %>% pull(gene_name) %>% unique()
  • Pogo forager ant brains - LD
  • Pogo forager ant brains - DD
# SAMPLE NAME
## specify sample name
sample.name <- c("pbar_ld","pbar_dd")

## SPECIFY THE DATASET
which.sample <- sample.name[1]

writeLines(paste0("Selected Dataset: ", which.sample))
## Selected Dataset: pbar_ld
# LOAD DATABASES (TC9)
## 1. TC9_ejtk.db
ejtk.db <- dbConnect(RSQLite::SQLite(),
                   paste0(path_to_repo, "/data/databases/TC9_ejtk.db"))

## 2. TC9_data.db
data.db <- dbConnect(RSQLite::SQLite(),
                     paste0(path_to_repo, "/data/databases/TC9_data.db"))


# extract the (gene-expr X time-point-of-sampling) data
dat <-
  data.db %>%
  tbl(., paste0(which.sample ,"_fpkm")) %>%
  select(gene_name, everything()) %>%
  collect()

writeLines(
"What is the dimensions of the original dataset? 
[Rows = #genes, Cols = #samples]"
)
## What is the dimensions of the original dataset? 
## [Rows = #genes, Cols = #samples]
dim(dat[-1])
## [1] 12557    12

1.2 Clean data

The above dataset contains all genes (n=12,557) in the ant genome. However, not all of these genes are expressed in the ant head, and some are expressed at very low levels that are not biologically meaningful. Therefore, we will only keep the genes that are “expressed” (≥1 FPKM for at least half of all time points) in the ant head.

# Which genes are expressed throughout the day in forager heads?
  # count the number of time points that has ≥ 1 FPKM
  # subset the data and only keep the filtered genes

# arguments
min_expression = 1
min_timepoints = 6

dat <- dat |> 
  ## to do ----
  ## convert into function
  mutate(
    n_samples = rowSums(
      across(
        !gene_name,
        ~ . >= min_expression
      ),
      na.rm = TRUE
    )
  ) |> 
  filter(
    n_samples >= min_timepoints
  ) |> 
  select(
    - n_samples
  )

writeLines("Dimensions of the data post-filtering step [Rows = #genes, Cols = #samples]")
## Dimensions of the data post-filtering step [Rows = #genes, Cols = #samples]
dim(dat)
## [1] 9925   13

This is our cleaned, input data file for building the circadian GCN.

1.3 Format data

  • Log2 transform the data
datExpr = as.data.frame(t(log2(dat[-c(1)]+1)))
names(datExpr) = dat$gene_name
rownames(datExpr) = names(dat)[-c(1)]

# USE THE FOLLOWING CODE TO CHECK IF YOU HAVE ANY BAD SAMPLES #
  # gsg = goodSamplesGenes(datExpr, verbose = 3);
  # gsg$allOK

  # sampleTree = hclust(dist(datExpr0), method = "average");
  # # Plot the sample tree: Open a graphic output window of size 12 by 9 inches
  # # The user should change the dimensions if the window is too large or too small.
  # sizeGrWindow(12,9)
  # #pdf(file = "Plots/sampleClustering.pdf", width = 12, height = 9);
  # par(cex = 1);
  # par(mar = c(0,4,2,0))
  # plot(sampleTree, main = "Sample clustering to detect outliers", sub="", xlab="", cex.lab = 1.5,
  #      cex.axis = 1.5, cex.main = 2)

# save the number of genes and samples
# that will be used to create the circadian GCN
nGenes = ncol(datExpr)
nSamples = nrow(datExpr)

writeLines("Visualizing the log-transformed data")
## Visualizing the log-transformed data
datExpr |> 
  rownames_to_column(
    "sample"
  ) |> 
  as_tibble() |> 
  tidyr::pivot_longer(
    cols = !sample,
    names_to = "gene_id",
    values_to = "log2_fpkm"
  ) |> 
  mutate(
    sample = stringr::str_remove(sample, "ZT")
  ) |> 
  ggplot(
    aes(
      x = log2_fpkm, 
      # color = sample,
      fill = sample
    )
  ) + 
  geom_density(
    position = "stack"
  ) + 
  theme_Publication(25) +
  scale_fill_manual(
    values = viridis::viridis(nSamples)
  ) +
  theme(
    legend.position = "bottom",
    legend.justification = "right"
  ) +
  guides(
    fill = guide_legend(
      nrow = 3,
      byrow=TRUE
    )
  )

1.4 Calculate gene-gene similarity

# Calculate Kendall's tau-b correlation for each gene-gene pair
# sim_matrix <- cor((datExpr), method = "kendall") # this step takes time
# save(sim_matrix, 
#      file = paste0(path_to_repo, "/results/wgcna_files/ref",
#                    which.sample,"sim_matrix_", sample.name[1], "_TC9.RData")) # might be useful to save the sim_matrix and
load(paste0(
  path_to_repo, 
  "/results/wgcna_files/ref",
  which.sample,
  "sim_matrix_", 
  sample.name[1], 
  "_TC9.RData"
)) # load it up

## Let's display a chunk of the matrix (code from Hughitt 2016; github)
heatmap_indices <- sample(nrow(sim_matrix), 200)

writeLines(paste0("Plotting a chunk of the gene-gene similarity matrix with ", length(heatmap_indices), " genes."))
## Plotting a chunk of the gene-gene similarity matrix with 200 genes.
gplots::heatmap.2(t(sim_matrix[heatmap_indices, heatmap_indices]),
          col=inferno(100),
          labRow=NA, labCol=NA,
          trace='none', dendrogram='row',
          xlab='Gene', ylab='Gene',
          main= paste0("Similarity matrix \n correlation method = 'kendall' \n (", length(heatmap_indices), " random genes)"),
          density.info='none', revC=TRUE)

1.5 Create adjacency matrix

  • To create the adjacency matrix, we need to first identify the soft-thresholding power (see WGCNA for more info).
writeLines("Performing network topology analysis to pick soft-thresholding power")
## Performing network topology analysis to pick soft-thresholding power
# Choose a set of soft-thresholding powers
powers = c(c(1:10), seq(from = 12, to=30, by=2))
# # Call the network topology analysis function
sft = pickSoftThreshold(datExpr, powerVector = powers, verbose = 5)
## pickSoftThreshold: will use block size 4507.
##  pickSoftThreshold: calculating connectivity for given powers...
##    ..working on genes 1 through 4507 of 9925
##    ..working on genes 4508 through 9014 of 9925
##    ..working on genes 9015 through 9925 of 9925
##    Power SFT.R.sq  slope truncated.R.sq mean.k. median.k. max.k.
## 1      1    0.703  2.540          0.937  3690.0   3800.00   5080
## 2      2    0.274  0.518          0.819  1940.0   2010.00   3330
## 3      3    0.065 -0.192          0.668  1180.0   1200.00   2450
## 4      4    0.350 -0.557          0.731   793.0    776.00   1910
## 5      5    0.505 -0.778          0.783   564.0    526.00   1550
## 6      6    0.579 -0.924          0.821   418.0    371.00   1280
## 7      7    0.616 -1.020          0.844   320.0    269.00   1080
## 8      8    0.636 -1.120          0.851   252.0    200.00    929
## 9      9    0.657 -1.190          0.863   202.0    151.00    806
## 10    10    0.676 -1.230          0.875   165.0    115.00    706
## 11    12    0.707 -1.300          0.892   115.0     70.70    555
## 12    14    0.713 -1.340          0.887    83.2     45.00    447
## 13    16    0.735 -1.350          0.892    62.5     29.60    367
## 14    18    0.764 -1.340          0.900    48.2     20.30    305
## 15    20    0.811 -1.310          0.924    38.0     14.20    258
## 16    22    0.838 -1.310          0.942    30.5     10.20    224
## 17    24    0.819 -1.400          0.937    24.9      7.46    203
## 18    26    0.818 -1.460          0.943    20.6      5.53    186
## 19    28    0.836 -1.480          0.960    17.2      4.14    170
## 20    30    0.853 -1.510          0.971    14.5      3.18    157
writeLines("Plotting the resutls from the network topology analysis")
## Plotting the resutls from the network topology analysis
# Plot the results:
# sizeGrWindow(9, 5)
par(mfrow = c(1,2));
cex1 = 0.9;
# Scale-free topology fit index as a function of the soft-thresholding power
plot(sft$fitIndices[,1], -sign(sft$fitIndices[,3])*sft$fitIndices[,2],
     xlab="Soft Threshold (power)",ylab="Scale Free Topology Model Fit,signed R^2",type="n",
     main = paste("Scale independence"));
text(sft$fitIndices[,1], -sign(sft$fitIndices[,3])*sft$fitIndices[,2],
     labels=powers,cex=cex1,col="red");
# this line corresponds to using an R^2 cut-off of h
abline(h=0.70,col="red")
# Mean connectivity as a function of the soft-thresholding power
plot(sft$fitIndices[,1], sft$fitIndices[,5],
     xlab="Soft Threshold (power)",ylab="Mean Connectivity", type="n",
     main = paste("Mean connectivity"))
text(sft$fitIndices[,1], sft$fitIndices[,5], labels=powers, cex=cex1,col="red")

par(mfrow = c(1,1));

NOTE: The scale-free topology fit index reaches ~0.70 at a soft-thresholding-power=12.

Now, we can go ahead and create our adjacency matrix by power-transforming the similarity matrix (see WGCNA for more info).

## Specify the soft-thresholding-power
soft.power = 12

# Construct adjacency matrix
# adj_matrix <- adjacency.fromSimilarity(sim_matrix,
#                                        power=soft.power,
#                                        type='signed'
#                                         )
# save(adj_matrix, 
#      file = paste0(path_to_repo, "/results/wgcna_files/ref",
#                    which.sample,"adj_matrix_", sample.name[1], "_TC9.RData")) 

## Delete similarity matrix to free up memory
rm(sim_matrix)
gc()
##            used  (Mb) gc trigger   (Mb) limit (Mb)  max used   (Mb)
## Ncells  5357506 286.2    9813792  524.2         NA   9503946  507.6
## Vcells 10626677  81.1  246884106 1883.6      32768 481798582 3675.9
# might be useful to save the sim_matrix and
load(paste0(path_to_repo, "/results/wgcna_files/ref",
                   which.sample,"adj_matrix_", sample.name[1], "_TC9.RData")) 
# load it up


# Convert adj_matrix to matrix
gene_ids <- rownames(adj_matrix)

adj_matrix <- matrix(adj_matrix, nrow=nrow(adj_matrix))
rownames(adj_matrix) <- gene_ids
colnames(adj_matrix) <- gene_ids

writeLines(paste0("Plotting the power-transformed adjacency matrix for the same ", length(heatmap_indices)," genes as above"))
## Plotting the power-transformed adjacency matrix for the same 200 genes as above
## Same heatmap as before, but now with the power-transformed adjacency matrix
gplots::heatmap.2(t(adj_matrix[heatmap_indices, heatmap_indices]),
                  col=inferno(100),
                  labRow=NA, labCol=NA,
                  trace='none', dendrogram='row',
                  xlab='Gene', ylab='Gene',
                  main='Adjacency matrix\n(post power transformation)',
                  density.info='none', revC=TRUE)


Step 2: Identify gene clusters

The following steps are performed as per guidelines from the WGCNA package and several tutorials made available online.

2.1 Create topological overalp matrix

# # Turn adjacency into topological overlap
# TOM = TOMsimilarity(adj_matrix);
# dissTOM = 1-TOM
# save(dissTOM, 
#      file = paste0(path_to_repo, "/results/wgcna_files/ref",
#                    which.sample,"dissTOM_", sample.name[1], "_TC9.RData")) # might be useful to save the sim_matrix and
load(paste0(path_to_repo, "/results/wgcna_files/ref",
                   which.sample,"dissTOM_", sample.name[1], "_TC9.RData")) # load it up

# Call the hierarchical clustering function
geneTree = hclust(as.dist(dissTOM), method = "average")

writeLines("Plotting the resulting clustering tree (dendrogram)")
## Plotting the resulting clustering tree (dendrogram)
# sizeGrWindow(12,9)
# reset plotting parameter
par(mfrow = c(1,1))
plot(geneTree, xlab="", sub="", main = "Gene clustering on TOM-based dissimilarity",
     labels = FALSE, hang = 0.04)

2.2 Identify clusters

User defined parameters:

  • minimum size (number of genes) of modules | var-name: minModuleSize
# We like large modules, so we set the minimum module size relatively high:
minModuleSize = 50;

# Module identification using dynamic tree cut:
dynamicMods= cutreeDynamic(dendro = geneTree,
                           distM = dissTOM,
                           method = "hybrid",
                           verbose = 4,
                           deepSplit = 3, 
                           # see WGCNA for more info on tuning parameters
                           pamRespectsDendro = FALSE,
                           minClusterSize = minModuleSize);
##  ..cutHeight not given, setting it to 0.995  ===>  99% of the (truncated) height range in dendro.
##  ..Going through the merge tree
##  
##  ..Going through detected branches and marking clusters..
##  ..Assigning Tree Cut stage labels..
##  ..Assigning PAM stage labels..
##  ....assigned 5159 objects to existing clusters.
##  ..done.
# view number of genes in each module
# table(dynamicMods)

writeLines("How many genes are there in each of the initial modules (clusters) detected?
Note: The names of the modules (colors) have no meaning.")
## How many genes are there in each of the initial modules (clusters) detected?
## Note: The names of the modules (colors) have no meaning.
# Convert numeric lables into colors
dynamicColors = WGCNA::labels2colors(dynamicMods)
table(dynamicColors)
## dynamicColors
##          black           blue          brown           cyan      darkgreen 
##            521            880            745            233            117 
##       darkgrey    darkmagenta darkolivegreen     darkorange        darkred 
##            108             66             66            100            125 
##  darkturquoise          green    greenyellow         grey60      lightcyan 
##            114            620            342            200            201 
##     lightgreen    lightyellow        magenta   midnightblue         orange 
##            166            154            431            205            106 
##  paleturquoise           pink         purple            red      royalblue 
##             72            448            378            585            151 
##    saddlebrown         salmon        sienna3        skyblue      steelblue 
##             88            253             62             88             79 
##            tan      turquoise         violet          white         yellow 
##            273           1068             69             99            657 
##    yellowgreen 
##             55

2.3 Merge similar modules

User defined parameters:

  • minimum correlation between two modules above which they are merged into one | var-name: MEDissThres
# Calculate eigengenes
MEList = WGCNA::moduleEigengenes(datExpr, colors = dynamicColors)
MEs = MEList$eigengenes

# Calculate dissimilarity of module eigengenes
MEDiss = 1-cor(MEs, method = "kendall");

writeLines("Clustering the module eigengenes and identifying a cutoff to merge similar modules")
## Clustering the module eigengenes and identifying a cutoff to merge similar modules
# Cluster module eigengenes
METree = hclust(as.dist(MEDiss), method = "average");
# Plot the result
# sizeGrWindow(7, 8)
plot(METree, main = "Clustering of module eigengenes",
     xlab = "", sub = "MEDiss = 1-cor(MEs, method = 'kendall')")

# We choose a height cut of 0.25, corresponding to correlation of 0.75, to merge
MEDissThres = 0.25 # user-specified parameter value; see WGCNA manual for more info

# Plot the cut line into the dendrogram
abline(h=MEDissThres, col = "red")

writeLines(paste0("Merging modules that have a correlation ≥ ", 1-MEDissThres))
## Merging modules that have a correlation ≥ 0.75
# Call an automatic merging function
merge = WGCNA::mergeCloseModules(datExpr, dynamicColors, cutHeight = MEDissThres, verbose = 3)
##  mergeCloseModules: Merging modules whose distance is less than 0.25
##    multiSetMEs: Calculating module MEs.
##      Working on set 1 ...
##      moduleEigengenes: Calculating 36 module eigengenes in given set.
##    multiSetMEs: Calculating module MEs.
##      Working on set 1 ...
##      moduleEigengenes: Calculating 16 module eigengenes in given set.
##    multiSetMEs: Calculating module MEs.
##      Working on set 1 ...
##      moduleEigengenes: Calculating 12 module eigengenes in given set.
##    multiSetMEs: Calculating module MEs.
##      Working on set 1 ...
##      moduleEigengenes: Calculating 11 module eigengenes in given set.
##    Calculating new MEs...
##    multiSetMEs: Calculating module MEs.
##      Working on set 1 ...
##      moduleEigengenes: Calculating 11 module eigengenes in given set.
# The merged module colors
mergedColors = merge$colors;
# Eigengenes of the new merged modules:
mergedMEs = merge$newMEs;

writeLines("Plotting the identified clusters (denoted with colors) before and after merging.")
## Plotting the identified clusters (denoted with colors) before and after merging.
# sizeGrWindow(12, 9)
WGCNA::plotDendroAndColors(geneTree,
                    cbind(dynamicColors, mergedColors),
                    c("Dynamic Tree Cut", "Merged dynamic"),
                    dendroLabels = FALSE, hang = 0.03,
                    addGuide = TRUE, guideHang = 0.05)

# Rename to moduleColors
moduleColors = mergedColors

# Construct numerical labels corresponding to the colors
colorOrder = c("grey", 
               WGCNA::standardColors(50));
moduleLabels = match(moduleColors, colorOrder)-1

2.4 Calculate module-module similarity

writeLines("Calculating module-module similarity based on module-eigengene-expression.")
## Calculating module-module similarity based on module-eigengene-expression.
# Calculate similarity of the eigen-genes
sim_matrix_ME <- cor(mergedMEs, method = "kendall")

# calculate adj_matrix
adj_matrix_ME <- WGCNA::adjacency.fromSimilarity(sim_matrix_ME,
                                          power=1, # DO NOT power transform
                                          type='signed'
)

## CHANGE THE NAMES OF THE MODULES;
module_ids <- data.frame(
  old_labels = rownames(adj_matrix_ME) %>% 
    str_split("ME", 2) %>% 
    sapply("[", 2) %>% 
    as.character(),
  new_labels = paste0(
    "C", 
    1:nrow(adj_matrix_ME)
  )
)
# and coerce into matrix
adj_matrix_ME <- matrix(adj_matrix_ME, nrow=nrow(adj_matrix_ME))
rownames(adj_matrix_ME) <- module_ids$new_labels
colnames(adj_matrix_ME) <- module_ids$new_labels

# ## KEEP THE SAME MODULE NAMES (named by color)
# gene_ids <- rownames(adj_matrix_ME)
# # coerce into a matrix
# adj_matrix_ME <- matrix(adj_matrix_ME, nrow=nrow(adj_matrix_ME))
# rownames(adj_matrix_ME) <- gene_ids
# colnames(adj_matrix_ME) <- gene_ids

writeLines("Plotting the adjacency matrix that shows module-module similarity in expression")
## Plotting the adjacency matrix that shows module-module similarity in expression
gplots::heatmap.2(t(adj_matrix_ME),
                  col=inferno(100),
                  # labRow=NA, labCol=NA,
                  trace='none', dendrogram='row',
                  xlab='', ylab='',
                  # main='Similarity matrix - MEs \n correlation method = "kendall")',
                  main='Adjacency matrix - MEs \n (module-module similarity)',
                  density.info='none', revC=TRUE)

2.5 Visualize the network

pacman::p_load(igraph)

adj_matrix_ME_igraph <- adj_matrix_ME

# get rid of low correlations (0.6 & 0.8 are arbitrary) [0.7 and 0.9]
adj_matrix_ME_igraph[adj_matrix_ME_igraph < 0.6] <- 0
adj_matrix_ME_igraph[adj_matrix_ME_igraph < 0.8 & adj_matrix_ME_igraph>0] <- 0.5
adj_matrix_ME_igraph[adj_matrix_ME_igraph >= 0.9] <- 1

# build_network
network <- graph.adjacency(adj_matrix_ME_igraph,
                           mode = "upper",
                           weighted = T,
                           diag = F)

# simplify network
network <- igraph::simplify(network)  # removes self-loops

# remove isolated vertices (keep only the nodes)
isolated <- which(degree(network)==0)
network <- igraph::delete.vertices(network, isolated)

# E(network)$width <- E(network)$weight + min(E(network)$weight) + 1 # offset=1

# colors <- as.character(module_ids$old_labels)
# V(network)$color <- colors
V(network)$color <- "white"

# genes_ME <- factor(moduleColors, levels=colors) %>% summary()
V(network)$size <- igraph::degree(network, mode = "all")*3
# V(network)$size <- log2(genes_ME)^1.3

V(network)$label.color <- "black"
V(network)$frame.color <- "black"

E(network)$width <- E(network)$weight^2*4
E(network)$color <- "black"

# ## highlight shortest paths between two vetices
# short.path <- igraph::get.shortest.paths(network, "S_5", "S_15")
# E(network, path = unlist(short.path[[1]]))$color <- col.scheme[2]
# E(network, path = unlist(short.path[[1]]))$width <- E(network)$weight*8

writeLines("Visualizing a simplified representation of the circadian GCN, with and without labels")
## Visualizing a simplified representation of the circadian GCN, with and without labels
# trash <- dev.off()
par(mfrow = c(1,1))
    
# png(paste0(path_to_repo, "/results/figures/", sample.name[1], "/", script.name,"/", sample.name[1],"_GCN_2.png"),
#     width = 20, height = 30, units = "cm", res = 600)
# par(bg=NA)
plot(network,
     size=20,
     layout=layout.kamada.kawai,
       # layout=layout.fruchterman.reingold,
       # layout=layout.graphopt,
       # layout=layout_in_circle,
     # vertex.label=NA
     vertex.size=hub.score(network)$vector*30
     # vertex.shape="none"
)

# dev.off()

Step 3: Annotate the network

list of module x genes

# Make a list that returns gene names for a given cluster
module_genes <- list()

modules.to.exclude <- c("")
# modules.to.exclude <- c(paste0("C",c(2,5,6,7,10:17,19)))
which.modules <- module_ids %>% filter(!new_labels %in% modules.to.exclude) %>% pull(old_labels)
which.labels <- module_ids %>% filter(!new_labels %in% modules.to.exclude) %>% pull(new_labels)

# Get the genes from each of the modules
for (i in 1:length(which.modules)) {
  
  # which color
  mod.color = as.character(which.modules[[i]])
  
  # subset
  module_genes[[i]] <- names(datExpr)[which(moduleColors==mod.color)]
  names(module_genes)[[i]] <- as.character(which.labels[[i]])
}
# # check the result | works
# names(module_genes)
# module_genes['C22']

# [13 Dec 2021]
# Save a csv with the module identity information for all genes used in building the GCN
# make a dataframe with gene_name and module_identity
for (i in 1:length(module_genes)){
  if (i == 1){
    pbar.ld.mods <- data.frame(gene_name = module_genes[[i]],
                                    module_identity = as.character(names(module_genes)[i]))
  }
  else{
   foo <- data.frame(gene_name = module_genes[[i]],
                                    module_identity = as.character(names(module_genes)[i]))
    pbar.ld.mods <- rbind(pbar.ld.mods, foo) 
  }
  
}

# save the dataframe as a csv
pbar.ld.mods %>%
  left_join(module_ids, by = c("module_identity" = "new_labels")) %>%
  write.csv(.,
            paste0(path_to_repo,
                   "/results/wgcna_files/res/pbar_ld_module_identity_new_labels.csv"),
            row.names = F)
# done.

# # save a copy with all the gene annotations
# # load Cflo gene annotations
# cflo_annots <- read.csv(paste0(path_to_repo,"/functions/func_data/cflo_annots.csv"),
#                             header=T, stringsAsFactors = F, na.strings = c(NA,""," "))
# pbar.ld.mods %>%
#   left_join(cflo_annots, by="gene_name") %>% head()
#   write.csv(.,
#             paste0(path_to_repo,
#                    "/results/WGCNA/cflo/cflo_heads_control_module_identity_new_labels_annots.csv"),
#             row.names = F)

3.1 Load genes of interest

# load the list of all genes of interest
load(file = paste0(path_to_repo,"/results/genes_of_interest/goi_list.RData"))

sapply(goi.list[[1]][[1]], length)
## pbar_ld_24h pbar_ld_12h pbar_ld_08h 
##        1536         179         354

3.4 Where are my genes of interest located?

make your gene sets (character vectors)

## Rhythmic genes
rhy24.ld <- goi.list[[1]][[1]][[1]]
rhy24.dd <- goi.list[[1]][[2]][[1]]

## Ultradian genes
rhy12.ld <- goi.list[[1]][[1]][[2]]
rhy12.dd <- goi.list[[1]][[2]][[2]]
rhy08.ld <- goi.list[[1]][[1]][[3]]
rhy08.dd <- goi.list[[1]][[2]][[3]]

# ## DRGs
# for.brain.head.rhy24.cluster1 <- goi.list[[2]]
# for24.nur8 <- goi.list[[6]][[1]]

Comparison

Modules vs. Rhythmic genes

writeLines("#####################################################
How many genes are in each of my geneset of interest?
#####################################################")
## #####################################################
## How many genes are in each of my geneset of interest?
## #####################################################
## MAKE YOUR LIST OF GENES OF INTEREST ##

# LIST ONE - WGCNA modules
list1 <- module_genes
writeLines("List of interesting genes #1
----------------------------
Genes in each of the identified gene-clusters or modules")
## List of interesting genes #1
## ----------------------------
## Genes in each of the identified gene-clusters or modules
sapply(list1, length)
##   C1   C2   C3   C4   C5   C6   C7   C8   C9  C10  C11 
##  810 1855  246  200 2015   69  233   99  143 2225 2030
## LIST TWO - rhythmic genes
list2 <- list(rhy24.ld,
              rhy24.dd,
              
              rhy12.ld,
              rhy12.dd,
              
              rhy08.ld,
              rhy08.dd
              )
writeLines("List of interesting genes #2
----------------------------
Rhythmic genes in control Cflo heads")
## List of interesting genes #2
## ----------------------------
## Rhythmic genes in control Cflo heads
names(list2) <- c("rhy24-Pbar-LD",
                  "rhy24-Pbar-DD",
                  
                  "rhy12-Pbar-LD",
                  "rhy12-Pbar-DD",
                  
                  "rhy08-Pbar-LD",
                  "rhy08-Pbar-DD"
                  )
sapply(list2, length)
## rhy24-Pbar-LD rhy24-Pbar-DD rhy12-Pbar-LD rhy12-Pbar-DD rhy08-Pbar-LD 
##          1536          1547           179           341           354 
## rhy08-Pbar-DD 
##           717
## CHECK FOR OVERLAP

## make a GOM object
gom.1v2 <- newGOM(list2, list1,
       genome.size = nGenes)
# png(paste0(path_to_repo, "/results/figures/",
#            "02_pogo_GCN/",
#            sample.name[1],"_gom_1v2.png"),
#     width = 35, height = 15, units = "cm", res = 300)
drawHeatmap(gom.1v2,
              adj.p=T,
              cutoff=0.05,
              what="odds.ratio",
              # what="Jaccard",
              log.scale = T,
              note.col = "orange")

# trash <- dev.off()

 # writeLines("How many genes exactly are overlapping between the pairwise comparisons")
# getMatrix(gom.1v4, name = "intersection") %>% t()

writeLines("Visualizing the significant overlaps between your lists of interesting genes and the identified modules")
## Visualizing the significant overlaps between your lists of interesting genes and the identified modules

PLOT MODULE EXPRESSION

Daily tempo

which.modules <- c("C1","C2",
                   "C10","C11")
module.plots <- list()
for (i in 1: length(which.modules)) {
  
  writeLines(paste0("Plotting expression for module ",
                    which.modules[i]))
  module.plots[[i]] <-
    module_genes[[which.modules[[i]]]] %>%
    stacked.zplot(bg.alpha = .08, alpha = 1) %>% 
    multi.plot(rows = 2, cols = 1)

}
## Plotting expression for module C1

## Plotting expression for module C2

## Plotting expression for module C10

## Plotting expression for module C11

module.plots <- module.plots |> 
  setNames(which.modules)

# png(paste0(path_to_repo, "/results/figures/", sample.name[1],"module_exp.png"),
#     width = 25, height = 40, units = "cm", res = 400)
# module.plots %>%
#   multi.plot(rows = 3, cols = 2)
# trash <- dev.off()

Amplitude

paste0("C", 1:11) |>
  purrr::map(
    function(m) {
      module_genes[[m]] |> 
        amplitude.plot(
          stats = FALSE
        ) +
        labs(
          title = m
        ) +
        scale_y_continuous(
          n.breaks = 3
        ) +
        theme(
          axis.title.x = element_blank(),
          axis.title.y = element_blank()
        )
    }
  ) |> 
  multi.plot(rows = 3, cols = 4)

## TableGrob (3 x 4) "arrange": 11 grobs
##     z     cells    name           grob
## 1   1 (1-1,1-1) arrange gtable[layout]
## 2   2 (1-1,2-2) arrange gtable[layout]
## 3   3 (1-1,3-3) arrange gtable[layout]
## 4   4 (1-1,4-4) arrange gtable[layout]
## 5   5 (2-2,1-1) arrange gtable[layout]
## 6   6 (2-2,2-2) arrange gtable[layout]
## 7   7 (2-2,3-3) arrange gtable[layout]
## 8   8 (2-2,4-4) arrange gtable[layout]
## 9   9 (3-3,1-1) arrange gtable[layout]
## 10 10 (3-3,2-2) arrange gtable[layout]
## 11 11 (3-3,3-3) arrange gtable[layout]

Avg daily exp


Step 5: Network statistics

From WGCNA-tutorial

5.1 Intramodular connectivity

“We begin by calculating the intramodular connectivity for each gene. (In network literature, connectivity is often referred to as”degree”.) The function intramodularConnectivity computes:

  • the whole network connectivity kTotal,
  • the within (intra)module connectivity kWithin,
  • the extra-modular connectivity kOut=kTotal-kWithin, and
  • the difference of the intra- and extra-modular connectivities kDiff=kIn-kOut=2*kIN-kTotal
# From what I can tell, colorh1 in the tutorial refers to moduleColors
colorh1 <- moduleColors

# Calculate the connectivities of the genes
Alldegrees1=intramodularConnectivity(adjMat = adj_matrix, colors = colorh1)
Alldegrees1 <- 
  Alldegrees1 %>% 
    rownames_to_column("gene_name") %>% 
    mutate_at(vars(starts_with("k")), 
              function(x){
                round(x,2)
                })
head(Alldegrees1)
##      gene_name kTotal kWithin  kOut  kDiff
## 1 LOC105421953  50.66   42.40  8.27  34.13
## 2 LOC105421955  29.61   17.94 11.67   6.27
## 3 LOC105421956  30.19   15.89 14.31   1.58
## 4 LOC105421958 137.18  125.42 11.76 113.66
## 5 LOC105421959  29.29   15.82 13.47   2.35
## 6 LOC105421960  45.23   31.11 14.12  16.98

5.2 (Module Membership)

Generalizing intramodular connectivity for all genes

“The intramodular connectivity measure is only defined for the genes inside a given module. But in practice it can be very important to measure how connected a given genes is to biologically interesting modules. Toward this end, we define a module eigengene-based connectivity measure for each gene as the correlation between a the gene expression and the module eigengene.

Specifically,

kMEbrown(i) = cor(xi,MEbrown)

where xi is the gene expression profile of gene i and MEbrown is the module eigengene of the brown module. Note: that the definition does not require that gene i is a member of the brown module. We define a data frame containing the module membership (MM) values for each module. In the past, we called the module membership values kME.”

Calculate the signed kME and display the first few rows/columns.

datKME=signedKME(datExpr, mergedMEs, outputColumnName="")
# Display the first few rows of the data frame
datKME[1:6,1:6]
##                    black lightyellow   darkgrey     grey60       blue
## LOC105421953  0.44222741 -0.25047902  0.3573258 0.39778640  0.7453533
## LOC105421955 -0.60646385 -0.56298796  0.2457627 0.55570755 -0.1096598
## LOC105421956  0.66900676  0.11698937  0.1002657 0.04285703  0.6500437
## LOC105421958  0.70323811 -0.09883344 -0.0593568 0.24758477  0.8761597
## LOC105421959 -0.01839094 -0.42274637  0.1159919 0.49110237  0.3959603
## LOC105421960 -0.65970317 -0.59076495  0.6733935 0.57430514 -0.1134845
##                   violet
## LOC105421953  0.53984393
## LOC105421955 -0.01133061
## LOC105421956  0.39863896
## LOC105421958  0.57534334
## LOC105421959  0.52640431
## LOC105421960 -0.09532808

Let’s plot the connectivity vs. module-membership values for all identified clusters.

# colorlevels=as.character(module_ids$old_labels)
# # sizeGrWindow(9,6)
# 
# png(paste0(path_to_repo, "/results/figures/", sample.name[1],"/", script.name, "/",
#            sample.name[1],"_connectivity_kWithin_v_modulemembership.png"), 
#     width = 30, height = 40, units = "cm", res = 300)
# 
# par(mfrow=c(as.integer(0.5+length(colorlevels)/4), 4))
# par(mar = c(3,3,2,2))
# # par(mar = c(4,5,3,1))
# for (i in c(1:length(colorlevels))) {
# whichmodule=colorlevels[[i]];
# restrict1 = (colorh1==whichmodule);
# verboseScatterplot(Alldegrees1$kWithin[restrict1],
#                    datKME[restrict1, whichmodule],
#                     col= rgb(red=169, green=169, blue=169, alpha = 80, maxColorValue = 255),
#                     # bg = colorh1[restrict1],
#                     bg = rgb(red=169, green=169, blue=169, alpha = 30, maxColorValue = 255),
#                     pch = 21,
#                     lwd = 1.5,
#                     cex = 1.2,
#                    
#                     # main=whichmodule,
#                     main=module_ids %>% filter(old_labels==whichmodule) %>% pull(new_labels) %>% as.character(),
#                     xlab = "Connectivity_kWithin", ylab = "Module-membership", abline = TRUE)
# }
# 
# trash <- dev.off()

Plotting the mean (± 95% CI) connectivity of genes in different modules

# 
# # Make the summary function (borrowed)
# summarySE <- function(data=NULL, measurevar, groupvars=NULL, na.rm=FALSE,
#                       conf.interval=.95, .drop=TRUE) {
#   
#   # New version of length which can handle NA's: if na.rm==T, don't count them
#   length2 <- function (x, na.rm=FALSE) {
#     if (na.rm) sum(!is.na(x))
#     else  length(x)
#   }
#   
#   # This does the summary. For each group's data frame, return a vector with
#   # N, mean, and sd
#   datac <- plyr::ddply(data, groupvars, .drop=.drop,
#                        .fun = function(xx, col) {
#                          c(N    = length2(xx[[col]], na.rm=na.rm),
#                            mean = mean(xx[[col]], na.rm=na.rm),
#                            sd   = stats::sd(xx[[col]], na.rm=na.rm)
#                          )
#                        },
#                        measurevar
#   )
#   
#   # Rename the "mean" column    
#   datac <- plyr::rename(datac, c("mean" = measurevar))
#   
#   datac$se <- datac$sd / sqrt(as.numeric(datac$N))  # Calculate standard error of the mean
#   
#   # Confidence interval multiplier for standard error
#   # Calculate t-statistic for confidence interval: 
#   # e.g., if conf.interval is .95, use .975 (above/below), and use df=N-1
#   ciMult <- qt(conf.interval/2 + .5, datac$N-1)
#   datac$ci <- datac$se * ciMult
#   
#   return(datac)
# }

pd <- position_dodge(0.1)

Alldegrees1 %>% 
  # rownames_to_column("gene_name") %>% 
  left_join(pbar.ld.mods, by="gene_name") %>%
  
  # PLOT FROM RAW DATA
  mutate(module_identity = factor(module_identity, 
                                  levels = paste0("C",1:nrow(module_ids)))) %>%
  
  # ggplot(aes(x=module_identity, y=kTotal)) +
  # # ggplot(aes(x=module_identity, y=kWithin)) +
  # 
  #   geom_hline(yintercept = 45, col="darkgrey", alpha=0.8) +
  #   geom_hline(yintercept = 30, col="darkgrey", alpha=0.8) +
  #   # geom_hline(yintercept = 75, col="darkgrey", alpha=0.8) +
  # 
  #   geom_boxplot(fill="lightgrey",
  #                alpha=0.6,
  #                outlier.color = "grey60",
  #                position = "dodge2") +
  #   
  #   theme_Publication() +
  #   scale_colour_Publication() +
  #   xlab("") +
  #   ggtitle("") +
  # 
  #   ylab("Total connectivity") +
  #   # ylab("Intramodular connectivity") +
  # 
  #   theme(text = element_text(size = 20, colour = 'black'),
  #         legend.position = "none",
  #         axis.line.y = element_line(colour = "transparent",size=1)) +
  #   coord_flip()
  
  
  # SUMMARIZE DATA AND PLOT
  # summarySE(., measurevar = "kWithin", groupvars = "module_identity") %>%
  summarySE(., measurevar = "kTotal", groupvars = "module_identity") %>%

  # make the value column
  mutate(value=kTotal) %>%
  mutate(module_identity = factor(module_identity,
                                  levels = paste0("C",1:nrow(module_ids)))) %>%

  # Plot
  ggplot(aes(x=((module_identity)), y=value)) +

    theme_Publication() +
    scale_colour_Publication() +
    xlab("") +
    ylab("Total connectivity") +
    ggtitle("") +
    #
    # theme(axis.title.x = element_blank(),
    #       axis.title.y = element_blank(),
    #       legend.position='none',
    #       # legend.title = element_text(size = 15, colour = 'black'),
    #       legend.text = element_blank()) +
    # ## center align the title
    # theme(plot.title = element_blank()) +
    #
    # scale_x_continuous(breaks = c(0,12,24)) +
    #scale_y_continuous(limits = c(0,40)) +

    # theme(panel.grid.major.x = element_line(colour = "#808080", size=0.1),
    #       panel.grid.major.y = element_line(colour = "#808080", size=0.2)) +
    # ## if you need highlighting parts of the graph
    # geom_rect(aes(xmin = 11.8, xmax = 23.8, ymin = -Inf, ymax = Inf),
    #           fill = "lightgrey", alpha = 0.02, color=NA) +

    # geom_line(position=pd,
    #           col="#F2CB05", size=2, alpha=1) + # total
    #           # col="#BF0404", size=2, alpha=1) + # FS
    #           # col="#0FBF67", size=2, alpha=1) + # FA

    ## Add error bar here
    geom_errorbar(aes(ymin=value-ci, ymax=value+ci),
                  width=.4, position=pd, col="black", alpha = 0.7) +

    # Add the points on top of the error bars
    geom_point(position=pd, size=2.5,
               col="black", fill="grey60",
               show.legend = F, color="black", pch=21, alpha=0.9) +

    #facet_wrap(~Phase, nrow=2)
    # scale_fill_manual(values = c("black","#F20505","#F2CB05","#0FBF67")) +
    # scale_color_manual(values=c("#F20505","#F5D736","#0FBF67")) +
    theme(text = element_text(size = 20, colour = 'black'),
          legend.position = "none") +

    coord_flip()

    # [as per reviewer request]
    # forcing the y-axis to start at 0
    # expand_limits(x = 0, y = 0)

5.3 Betweenness centrality

Next, we can look at betweenness centrality:

# network.complete <- graph_from_adjacency_matrix(adj_matrix,
#                                               mode = "undirected",
#                                               weighted = T)
# 
# V(network.complete) %>% length()
# E(network.complete) %>% length()

Step 6: Pogo genes of interest

6.1 Response to humidity

  • Read data from Friedman et al. 2020
  • Set correlation threshold | cor(collective behav, geneexpression) = [-0.6,0.5]
    • we selected the top 5% as the positively correlated genes
    • similarly, bottom 5% as the negetively
    • Note, we tried using higher thresholds, but the results do not change vastly.
# Read 2020_Friedman_et_al data -------------------------------------------

df.2020 <- read.csv(paste0(path_to_pogodata, "2020_friedman_Pbar_TPM_TraitCorr_dNdS.csv"), 
                    header = T, stringsAsFactors = F, na.strings = c(""," ",".", "NA")) %>% as_tibble()

# note, it is unclear what the columns actually mean, 
# but, below find an example of what we could do.
# Assumption: "humidity_cor" represents the correlation of gene expression to humidity changes

## Keep relevant columns
df.2020 <- 
  df.2020 %>% 
  select(pogo_gene = LOC,
         humidity_cor:da_5ht_cor_colony_mean)

## Deciding on the threshold
## the first and the third quartiles

# Set humidity-expression correlation threshold
h.threshold <- quantile(df.2020$humidity_cor, prob=c(.05,.95), type=1)

# obtain pogo genes that satisfied the threshold
h.genes.pos <- 
  df.2020 %>% 
  as_tibble() %>% 
  select(pogo_gene,
         humidity_cor) %>% 
  na.omit() %>% 
  filter(humidity_cor > h.threshold[2]) %>% 
  pull(pogo_gene)
h.genes.neg <- 
  df.2020 %>% 
  as_tibble() %>% 
  select(pogo_gene,
         humidity_cor) %>% 
  na.omit() %>% 
  filter(humidity_cor < h.threshold[1]) %>% 
  pull(pogo_gene)

The top and bottom 5% gaves us a set of 516 positively and a set of 516 negatively correlated pogo genes.

4.4 Re-Annotate the network

Now, let’s see which modules, if any, in the cflo brain GCN do these genes reside?

The names have the following meanings:

  • pogo-rH-pos = Pogonomyrmex genes whose expression show a positive correlation with collective response to abiotic changes
  • pogo-rH-neg = Pogonomyrmex genes whose expression show a negative correlation with collective response to abiotic changes

Modules vs. Rhythmic|BehavioralPlasticity genes

writeLines("#####################################################
How many genes are in each of my geneset of interest?
#####################################################")
## #####################################################
## How many genes are in each of my geneset of interest?
## #####################################################
## MAKE YOUR LIST OF GENES OF INTEREST ##

# LIST ONE - WGCNA modules
list1 <- module_genes
writeLines("List of interesting genes #1
----------------------------
Genes in each of the identified gene-clusters or modules")
## List of interesting genes #1
## ----------------------------
## Genes in each of the identified gene-clusters or modules
sapply(list1, length)
##   C1   C2   C3   C4   C5   C6   C7   C8   C9  C10  C11 
##  810 1855  246  200 2015   69  233   99  143 2225 2030
## LIST TWO - rhythmic genes
list3 <- list(rhy24.ld,
              rhy24.dd,
              
              rhy12.ld,
              rhy12.dd,
              
              rhy08.ld,
              rhy08.dd,
              
              h.genes.pos,
              h.genes.neg
              )
writeLines("List of interesting genes #2
----------------------------
Rhythmic genes in control Cflo heads")
## List of interesting genes #2
## ----------------------------
## Rhythmic genes in control Cflo heads
names(list3) <- c("rhy24-Pbar-LD",
                  "rhy24-Pbar-DD",
                  
                  "rhy12-Pbar-LD",
                  "rhy12-Pbar-DD",
                  
                  "rhy08-Pbar-LD",
                  "rhy08-Pbar-DD",
                  
                  "pogo-rH-pos", 
                  "pogo-rH-neg"
                  )
sapply(list3, length)
## rhy24-Pbar-LD rhy24-Pbar-DD rhy12-Pbar-LD rhy12-Pbar-DD rhy08-Pbar-LD 
##          1536          1547           179           341           354 
## rhy08-Pbar-DD   pogo-rH-pos   pogo-rH-neg 
##           717           516           516
writeLines("Visualizing the significant overlaps between your lists of interesting genes and the identified modules")
## Visualizing the significant overlaps between your lists of interesting genes and the identified modules
## CHECK FOR OVERLAP

## make a GOM object
gom.1v3 <- newGOM(list3, list1,
       genome.size = nGenes)
# png(paste0(path_to_repo, "/results/figures/",
#            "02_pogo_GCN/",
#            sample.name[1],"_gom_1v2.png"),
#     width = 35, height = 15, units = "cm", res = 300)
drawHeatmap(gom.1v3,
              adj.p=T,
              cutoff=0.05,
              what="odds.ratio",
              # what="Jaccard",
              log.scale = T,
              note.col = "orange")

# trash <- dev.off()

writeLines("How many genes exactly are overlapping between the pairwise comparisons")
## How many genes exactly are overlapping between the pairwise comparisons
getMatrix(gom.1v3, name = "intersection") %>% t()
##     rhy24-Pbar-LD rhy24-Pbar-DD rhy12-Pbar-LD rhy12-Pbar-DD rhy08-Pbar-LD
## C1            139           168            11             8             8
## C2            720           510            17            20            34
## C3             68            19             0             9             3
## C4             54            17             0             1             5
## C5             30           246            22            54            44
## C6              4             4             0             3             0
## C7             14            42             2             7             2
## C8             23             7             0             2             2
## C9              0            22             0            17             2
## C10            18           222            21            65            24
## C11           354           134            28            43            60
##     rhy08-Pbar-DD pogo-rH-pos pogo-rH-neg
## C1             74         134           5
## C2             47         202          20
## C3             17           3          18
## C4              9           4           9
## C5            123         108          36
## C6              6           2           5
## C7             12           0          17
## C8              2           2           4
## C9              2          13           0
## C10           135          18         179
## C11           144          13         202

Step 7: Cflo genes of interest

7.1 Behavioral plasticity

  • Read data from Das and de Bekker 2022
  • For a given set of Cflo genes of interest, find orthologs in Pbar
  • Check to see where the Cflo genes of interest are located in the Pbar GCN
pogo.for24 <- 
        cflo.for24 %>%      # 3569 cflo genes with 24h rhythms in forager brains
        cflo_to_pogo()      # 3265 cflo genes have a one-to-one pbar ortholog

pogo.for24nur8 <-
        cflo.for24nur8 %>%  # 291 cflo genes with 24h rhythms in forager brains, but 8h in nurses
        cflo_to_pogo()      # 276 cflo genes have a one-to-one pbar ortholog

The top and bottom 5% gaves us a set of 516 positively and a set of 516 negatively correlated pogo genes.

4.4 Re-Annotate the network

Now, let’s see which modules, if any, in the cflo brain GCN do these genes reside?

The names have the following meanings:

  • pogo-rH-pos = Pogonomyrmex genes whose expression show a positive correlation with collective response to abiotic changes

  • pogo-rH-neg = Pogonomyrmex genes whose expression show a negative correlation with collective response to abiotic changes

  • for24-Cflo = P. barbatus orthologs of Cflo genes that show 24h rhythms in forager brains

  • for24nur8-Cflo = P. barbatus orthologs of Cflo genes that show 24h rhythms in forager brains, but 8h oscillations in nurse brains

Modules vs. Rhythmic|BehavioralPlasticity genes - V2

writeLines("#####################################################
How many genes are in each of my geneset of interest?
#####################################################")
## #####################################################
## How many genes are in each of my geneset of interest?
## #####################################################
## MAKE YOUR LIST OF GENES OF INTEREST ##

# LIST ONE - WGCNA modules
list1 <- module_genes
writeLines("List of interesting genes #1
----------------------------
Genes in each of the identified gene-clusters or modules")
## List of interesting genes #1
## ----------------------------
## Genes in each of the identified gene-clusters or modules
sapply(list1, length)
##   C1   C2   C3   C4   C5   C6   C7   C8   C9  C10  C11 
##  810 1855  246  200 2015   69  233   99  143 2225 2030
## LIST TWO - rhythmic genes
list4 <- list(rhy24.ld,
              rhy24.dd,
              
              h.genes.pos,
              h.genes.neg,
              
              pogo.for24,
              pogo.for24nur8
              )
writeLines("List of interesting genes #2
----------------------------
Rhythmic genes in control Cflo heads")
## List of interesting genes #2
## ----------------------------
## Rhythmic genes in control Cflo heads
names(list4) <- c("rhy24-Pbar-LD",
                  "rhy24-Pbar-DD",
                  
                  "pogo-rH-pos", 
                  "pogo-rH-neg",
                  
                  "for24-Cflo", 
                  "for24nur8-Cflo"
                  )
sapply(list4, length)
##  rhy24-Pbar-LD  rhy24-Pbar-DD    pogo-rH-pos    pogo-rH-neg     for24-Cflo 
##           1536           1547            516            516           3265 
## for24nur8-Cflo 
##            276
writeLines("Visualizing the significant overlaps between your lists of interesting genes and the identified modules")
## Visualizing the significant overlaps between your lists of interesting genes and the identified modules
## CHECK FOR OVERLAP

## make a GOM object
gom.1v4 <- newGOM(list4, list1,
       genome.size = nGenes)
# png(paste0(path_to_repo, "/results/figures/",
#            "02_pogo_GCN/",
#            sample.name[1],"_gom_1v2.png"),
#     width = 35, height = 15, units = "cm", res = 300)
drawHeatmap(gom.1v4,
              adj.p=T,
              cutoff=0.05,
              what="odds.ratio",
              # what="Jaccard",
              log.scale = T,
              note.col = "orange")

# trash <- dev.off()

writeLines("How many genes exactly are overlapping between the pairwise comparisons")
## How many genes exactly are overlapping between the pairwise comparisons
getMatrix(gom.1v4, name = "intersection") %>% t()
##     rhy24-Pbar-LD rhy24-Pbar-DD pogo-rH-pos pogo-rH-neg for24-Cflo
## C1            139           168         134           5        315
## C2            720           510         202          20        633
## C3             68            19           3          18         71
## C4             54            17           4           9         54
## C5             30           246         108          36        501
## C6              4             4           2           5         23
## C7             14            42           0          17         65
## C8             23             7           2           4         20
## C9              0            22          13           0         21
## C10            18           222          18         179        751
## C11           354           134          13         202        765
##     for24nur8-Cflo
## C1              59
## C2             101
## C3               2
## C4               2
## C5              52
## C6               1
## C7               2
## C8               0
## C9               3
## C10             28
## C11             23
writeLines("What are the corresponding Odds-ratio?")
## What are the corresponding Odds-ratio?
getMatrix(gom.1v4, name = "odds.ratio") %>% t() %>% round(2)
##     rhy24-Pbar-LD rhy24-Pbar-DD pogo-rH-pos pogo-rH-neg for24-Cflo
## C1           1.14          1.47        4.53        0.10       1.33
## C2           5.64          2.57        3.02        0.17       1.07
## C3           2.14          0.45        0.22        1.46       0.82
## C4           2.06          0.50        0.37        0.86       0.75
## C5           0.06          0.71        1.04        0.28       0.62
## C6           0.33          0.33        0.54        1.43       1.02
## C7           0.34          1.20        0.00        1.45       0.78
## C8           1.66          0.41        0.37        0.77       0.51
## C9           0.00          0.98        1.84        0.00       0.35
## C10          0.03          0.53        0.12        1.91       1.05
## C11          1.20          0.32        0.09        2.67       1.30
##     for24nur8-Cflo
## C1            3.22
## C2            2.60
## C3            0.28
## C4            0.35
## C5            0.91
## C6            0.51
## C7            0.30
## C8            0.00
## C9            0.75
## C10           0.38
## C11           0.35
writeLines("What are the corresponding p-values (Fisher's test)?")
## What are the corresponding p-values (Fisher's test)?
getMatrix(gom.1v4, name = "pval") %>% t() %>% round(2)
##     rhy24-Pbar-LD rhy24-Pbar-DD pogo-rH-pos pogo-rH-neg for24-Cflo
## C1           0.09          0.00        0.00        1.00       0.00
## C2           0.00          0.00        0.00        1.00       0.11
## C3           0.00          1.00        1.00        0.09       0.93
## C4           0.00          1.00        0.99        0.72       0.97
## C5           1.00          1.00        0.38        1.00       1.00
## C6           1.00          1.00        0.88        0.29       0.51
## C7           1.00          0.17        1.00        0.10       0.96
## C8           0.03          1.00        0.97        0.76       1.00
## C9           1.00          0.56        0.04        1.00       1.00
## C10          1.00          1.00        1.00        0.00       0.17
## C11          0.00          1.00        1.00        0.00       0.00
##     for24nur8-Cflo
## C1            0.00
## C2            0.00
## C3            0.99
## C4            0.98
## C5            0.75
## C6            0.86
## C7            0.99
## C8            1.00
## C9            0.76
## C10           1.00
## C11           1.00

Plot temporal patterns

h.genes.pos %>% 
  stacked.zplot(bg.alpha = .08, alpha = 1) %>% 
  multi.plot(rows = 2, cols = 1)

## TableGrob (2 x 1) "arrange": 2 grobs
##   z     cells    name           grob
## 1 1 (1-1,1-1) arrange gtable[layout]
## 2 2 (2-2,1-1) arrange gtable[layout]
h.genes.neg %>% 
  stacked.zplot(bg.alpha = .08, alpha = 1) %>% 
  multi.plot(rows = 2, cols = 1)

## TableGrob (2 x 1) "arrange": 2 grobs
##   z     cells    name           grob
## 1 1 (1-1,1-1) arrange gtable[layout]
## 2 2 (2-2,1-1) arrange gtable[layout]

EXTRAS

Step 5: Export the results

There are several things we could output from our analyses, we decided to report the following in the supplementary file:

  • cluster identity
  • total connectivity
  • pfam annotations enriched in the module

Get pfam enrichments for each module

# module_pfams <- list()
# 
# bg.genes <- dat[[1]] ## all genes used to make the network
# which.test <- "pfams"
# 
# for (i in 1:length(which.labels)) {
#   
#   # get name of the module
#   m <- which.labels[[i]]
#   
#   # save the enrichment results
#   module_pfams[[i]] <- 
#     module_genes[[m]] %>% 
#     check_enrichment(.,
#                      bg = dat[[1]],
#                      what = which.test,
#                      org = "cflo",
#                      clean = T,
#                      expand = T,
#                      plot = F)
# }
# 
# ## Now clean it up
# sapply(module_pfams, nrow)
# c <- 0
# for (i in 1:length(module_pfams)) {
#   
#   if(is.null(nrow(module_pfams[[i]]))) {
#     paste(which.labels[[i]],"is null") %>% print()
#   } else if(nrow(module_pfams[[i]])==0) {
#     paste(which.labels[[i]], "is an empty tibble") %>% print()
#   } else {
#     
#     if(c==0) {
#     c <- c+1
#     module.pfams <- module_pfams[[i]]
#     } else {
#       module.pfams <- rbind(module.pfams, module_pfams[[i]])
#     }
#   }
# }
# 
# ## change the name of the column
# module.pfams <- 
#   module.pfams %>% 
#   select(gene_name, enriched_in_module=annot_desc) %>% 
#   filter(enriched_in_module!="no_desc")
# 
# # check the output dataframe
# module.pfams %>% head()

Let’s make the file

# results.gcn <-
#   pbar.ld.mods %>% 
#   
#   pull(gene_name) %>% 
#   
#   ## rhythmicity data
#   TC7_annotator() %>% 
#   select(gene_name, everything()) %>% 
#   
#   ## cluster identity
#   left_join(pbar.ld.mods, by="gene_name") %>% 
#   
#   ## add total connectivity data
#   left_join(Alldegrees1 %>% select(gene_name,kTotal), by="gene_name") %>% 
#   
#   ## add the enriched pfam
#   left_join(module.pfams, by="gene_name") %>% 
#   
#   # Geneset: "for.brain.head.rhy24.cluster1" | DRGs (disease-associated)
#   mutate(rhy_brain_head_cluster1 = ifelse(gene_name %in% for.brain.head.rhy24.cluster1, "yes", "no")) %>% 
#   # Geneset: "for24.nur8" | DRGs (caste-associated)
#   mutate(for24_nur8 = ifelse(gene_name %in% for24.nur8, "yes", "no")) %>% 
#   
#   ## Geneset: DEGs
#   mutate(ocflo_UP = ifelse(gene_name %in% ocflo.up, "yes", "no")) %>% 
#   mutate(ocflo_DOWN = ifelse(gene_name %in% ocflo.down, "yes", "no")) %>% 
#   mutate(beau_UP = ifelse(gene_name %in% beau.up, "yes", "no")) %>% 
#   mutate(beau_DOWN = ifelse(gene_name %in% beau.down, "yes", "no")) %>% 
#   
#   ## order the columns and rows
#   select(module_identity, kTotal, enriched_in_module, 
#          gene_name, gene_desc = blast_annotation, 
#          rhy_brain_head_cluster1, for24_nur8,
#          ocflo_UP, ocflo_DOWN, beau_UP, beau_DOWN,
#          everything()) %>% 
#   arrange(desc(module_identity), enriched_in_module, desc(kTotal)) 
#   
#   
# # ## export it
# # write.csv(results.gcn,
# #           file = paste0(path_to_repo, "/results/00_supplementary_files/07_Cflo_forager_head_GCN_results.csv"),
# #           row.names = F)


RANDOMS

R1 Plotting the daily expression of core clock, clock modulators, and clock controlled genes

# cgs.18 <- c("LOC105252466")
# cgs.19 <- c("LOC105250191", "LOC105256631")
# cgs.21 <- c("LOC105256454", "LOC105257275", "LOC105255207", "LOC105248529",
#             "LOC105253861", "LOC105257836")
# cgs.22 <- c("LOC105252510", "LOC105258655", "LOC105251553", "LOC105256952",
#             "LOC105252917", "LOC105249574", "LOC105259208")
# 
# cgs <- list(cgs.18, cgs.19, cgs.21, cgs.22)
# 
# plots <- list()
# for (i in 1:length(cgs)){
#   
# plots[[i]] <-
#   cgs[[i]] %>% 
#   stacked.zplot_tc7(bg.alpha = 0.4) %>% 
#   multi.plot(rows = 1, cols = 3)
# }
# 
# # png(paste0(path_to_repo, "/results/figures/", sample.name[1],"/",
# #            "clock_genes_daily_exp.png"), 
# #     width = 40, height = 20, units = "cm", res = 300)
# # plots %>% 
# #   multi.plot(rows = 1, cols = 4)
# # dev.off()